In [1]:
import pandas as pd
import seaborn as sns
from scipy.stats import zscore
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as p
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram, cut_tree
from ISLP.cluster import compute_linkage
from scipy.cluster.hierarchy import linkage
In [2]:
file_path = r"C:\Users\tiles\Downloads\causes-of-death-in-children-under-5.csv"
df = pd.read_csv(file_path)
df.head()
Out[2]:
| Entity | Code | Year | Deaths - Invasive Non-typhoidal Salmonella (iNTS) - Sex: Both - Age: Under 5 (Number) | Deaths - Interpersonal violence - Sex: Both - Age: Under 5 (Number) | Deaths - Nutritional deficiencies - Sex: Both - Age: Under 5 (Number) | Deaths - Acute hepatitis - Sex: Both - Age: Under 5 (Number) | Deaths - Neoplasms - Sex: Both - Age: Under 5 (Number) | Deaths - Measles - Sex: Both - Age: Under 5 (Number) | Deaths - Digestive diseases - Sex: Both - Age: Under 5 (Number) | ... | Deaths - Other neonatal disorders - Sex: Both - Age: Under 5 (Number) | Deaths - Whooping cough - Sex: Both - Age: Under 5 (Number) | Deaths - Diarrheal diseases - Sex: Both - Age: Under 5 (Number) | Deaths - Fire, heat, and hot substances - Sex: Both - Age: Under 5 (Number) | Deaths - Road injuries - Sex: Both - Age: Under 5 (Number) | Deaths - Tuberculosis - Sex: Both - Age: Under 5 (Number) | Deaths - HIV/AIDS - Sex: Both - Age: Under 5 (Number) | Deaths - Drowning - Sex: Both - Age: Under 5 (Number) | Deaths - Malaria - Sex: Both - Age: Under 5 (Number) | Deaths - Syphilis - Sex: Both - Age: Under 5 (Number) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | AFG | 1990 | 48 | 105 | 1779 | 718 | 431 | 8649 | 477 | ... | 7112 | 2455 | 3968 | 131 | 802 | 808 | 10 | 776 | 21 | 123 |
| 1 | Afghanistan | AFG | 1991 | 55 | 130 | 1822 | 741 | 439 | 8669 | 495 | ... | 7574 | 2385 | 4650 | 129 | 781 | 800 | 12 | 748 | 41 | 132 |
| 2 | Afghanistan | AFG | 1992 | 68 | 155 | 2069 | 836 | 486 | 8539 | 554 | ... | 8614 | 2370 | 5833 | 137 | 821 | 863 | 13 | 777 | 51 | 180 |
| 3 | Afghanistan | AFG | 1993 | 78 | 178 | 2427 | 970 | 549 | 8949 | 630 | ... | 9458 | 2659 | 7800 | 155 | 923 | 979 | 16 | 872 | 24 | 239 |
| 4 | Afghanistan | AFG | 1994 | 83 | 194 | 2649 | 1063 | 589 | 10642 | 681 | ... | 9823 | 3187 | 7894 | 170 | 1015 | 1064 | 19 | 961 | 52 | 259 |
5 rows × 32 columns
In [3]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 6840 entries, 0 to 6839 Data columns (total 32 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Entity 6840 non-null object 1 Code 6150 non-null object 2 Year 6840 non-null int64 3 Deaths - Invasive Non-typhoidal Salmonella (iNTS) - Sex: Both - Age: Under 5 (Number) 6840 non-null int64 4 Deaths - Interpersonal violence - Sex: Both - Age: Under 5 (Number) 6840 non-null int64 5 Deaths - Nutritional deficiencies - Sex: Both - Age: Under 5 (Number) 6840 non-null int64 6 Deaths - Acute hepatitis - Sex: Both - Age: Under 5 (Number) 6840 non-null int64 7 Deaths - Neoplasms - Sex: Both - Age: Under 5 (Number) 6840 non-null int64 8 Deaths - Measles - Sex: Both - Age: Under 5 (Number) 6840 non-null int64 9 Deaths - Digestive diseases - Sex: Both - Age: Under 5 (Number) 6840 non-null int64 10 Deaths - Cirrhosis and other chronic liver diseases - Sex: Both - Age: Under 5 (Number) 6840 non-null int64 11 Deaths - Chronic kidney disease - Sex: Both - Age: Under 5 (Number) 6840 non-null int64 12 Deaths - Cardiovascular diseases - Sex: Both - Age: Under 5 (Number) 6840 non-null int64 13 Deaths - Congenital birth defects - Sex: Both - Age: Under 5 (Number) 6840 non-null int64 14 Deaths - Lower respiratory infections - Sex: Both - Age: Under 5 (Number) 6840 non-null int64 15 Deaths - Neonatal preterm birth - Sex: Both - Age: Under 5 (Number) 6840 non-null int64 16 Deaths - Environmental heat and cold exposure - Sex: Both - Age: Under 5 (Number) 6840 non-null int64 17 Deaths - Neonatal sepsis and other neonatal infections - Sex: Both - Age: Under 5 (Number) 6840 non-null int64 18 Deaths - Exposure to forces of nature - Sex: Both - Age: Under 5 (Number) 6840 non-null int64 19 Deaths - Diabetes mellitus - Sex: Both - Age: Under 5 (Number) 6840 non-null int64 20 Deaths - Neonatal encephalopathy due to birth asphyxia and trauma - Sex: Both - Age: Under 5 (Number) 6840 non-null int64 21 Deaths - Meningitis - Sex: Both - Age: Under 5 (Number) 6840 non-null int64 22 Deaths - Other neonatal disorders - Sex: Both - Age: Under 5 (Number) 6840 non-null int64 23 Deaths - Whooping cough - Sex: Both - Age: Under 5 (Number) 6840 non-null int64 24 Deaths - Diarrheal diseases - Sex: Both - Age: Under 5 (Number) 6840 non-null int64 25 Deaths - Fire, heat, and hot substances - Sex: Both - Age: Under 5 (Number) 6840 non-null int64 26 Deaths - Road injuries - Sex: Both - Age: Under 5 (Number) 6840 non-null int64 27 Deaths - Tuberculosis - Sex: Both - Age: Under 5 (Number) 6840 non-null int64 28 Deaths - HIV/AIDS - Sex: Both - Age: Under 5 (Number) 6840 non-null int64 29 Deaths - Drowning - Sex: Both - Age: Under 5 (Number) 6840 non-null int64 30 Deaths - Malaria - Sex: Both - Age: Under 5 (Number) 6840 non-null int64 31 Deaths - Syphilis - Sex: Both - Age: Under 5 (Number) 6840 non-null int64 dtypes: int64(30), object(2) memory usage: 1.7+ MB
In [4]:
df.describe()
Out[4]:
| Year | Deaths - Invasive Non-typhoidal Salmonella (iNTS) - Sex: Both - Age: Under 5 (Number) | Deaths - Interpersonal violence - Sex: Both - Age: Under 5 (Number) | Deaths - Nutritional deficiencies - Sex: Both - Age: Under 5 (Number) | Deaths - Acute hepatitis - Sex: Both - Age: Under 5 (Number) | Deaths - Neoplasms - Sex: Both - Age: Under 5 (Number) | Deaths - Measles - Sex: Both - Age: Under 5 (Number) | Deaths - Digestive diseases - Sex: Both - Age: Under 5 (Number) | Deaths - Cirrhosis and other chronic liver diseases - Sex: Both - Age: Under 5 (Number) | Deaths - Chronic kidney disease - Sex: Both - Age: Under 5 (Number) | ... | Deaths - Other neonatal disorders - Sex: Both - Age: Under 5 (Number) | Deaths - Whooping cough - Sex: Both - Age: Under 5 (Number) | Deaths - Diarrheal diseases - Sex: Both - Age: Under 5 (Number) | Deaths - Fire, heat, and hot substances - Sex: Both - Age: Under 5 (Number) | Deaths - Road injuries - Sex: Both - Age: Under 5 (Number) | Deaths - Tuberculosis - Sex: Both - Age: Under 5 (Number) | Deaths - HIV/AIDS - Sex: Both - Age: Under 5 (Number) | Deaths - Drowning - Sex: Both - Age: Under 5 (Number) | Deaths - Malaria - Sex: Both - Age: Under 5 (Number) | Deaths - Syphilis - Sex: Both - Age: Under 5 (Number) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 6840.000000 | 6840.000000 | 6840.000000 | 6840.000000 | 6840.000000 | 6840.000000 | 6840.000000 | 6840.000000 | 6840.000000 | 6840.000000 | ... | 6840.000000 | 6840.000000 | 6.840000e+03 | 6840.000000 | 6840.000000 | 6840.000000 | 6840.000000 | 6840.000000 | 6840.000000 | 6840.000000 |
| mean | 2004.500000 | 1041.740789 | 399.418567 | 6392.636842 | 826.106433 | 1472.511550 | 8527.408772 | 1235.561696 | 262.530409 | 304.512135 | ... | 11726.084795 | 3876.111696 | 2.464864e+04 | 535.363743 | 1718.877047 | 2892.662719 | 3252.680702 | 2331.720468 | 12045.209064 | 2107.161111 |
| std | 8.656074 | 5943.506061 | 1549.064285 | 30815.191076 | 4399.035288 | 5794.139457 | 43502.336767 | 5006.538348 | 1119.050195 | 1208.144730 | ... | 51612.890640 | 16817.425576 | 1.133132e+05 | 2156.814008 | 6934.211045 | 13333.898943 | 18169.939174 | 10832.408381 | 64858.902628 | 9180.674890 |
| min | 1990.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 1997.000000 | 0.000000 | 1.000000 | 1.000000 | 0.000000 | 7.000000 | 0.000000 | 2.000000 | 0.000000 | 1.000000 | ... | 18.750000 | 0.000000 | 3.000000e+00 | 2.000000 | 4.000000 | 0.000000 | 0.000000 | 3.000000 | 0.000000 | 0.000000 |
| 50% | 2004.500000 | 1.000000 | 10.000000 | 17.000000 | 2.000000 | 44.000000 | 1.000000 | 25.000000 | 4.000000 | 7.000000 | ... | 178.000000 | 21.000000 | 9.800000e+01 | 15.000000 | 38.500000 | 11.000000 | 7.000000 | 27.000000 | 0.000000 | 12.000000 |
| 75% | 2012.000000 | 42.000000 | 77.000000 | 903.250000 | 36.000000 | 283.000000 | 655.000000 | 290.250000 | 52.000000 | 78.000000 | ... | 1773.250000 | 671.000000 | 3.822250e+03 | 142.000000 | 359.250000 | 388.000000 | 298.250000 | 291.250000 | 217.250000 | 277.250000 |
| max | 2019.000000 | 62334.000000 | 21223.000000 | 524103.000000 | 50184.000000 | 85197.000000 | 704288.000000 | 77952.000000 | 15916.000000 | 18047.000000 | ... | 539952.000000 | 240021.000000 | 1.649581e+06 | 35583.000000 | 115624.000000 | 209562.000000 | 223680.000000 | 184096.000000 | 631523.000000 | 99248.000000 |
8 rows × 30 columns
In [5]:
df.isnull().sum()
Out[5]:
Entity 0 Code 690 Year 0 Deaths - Invasive Non-typhoidal Salmonella (iNTS) - Sex: Both - Age: Under 5 (Number) 0 Deaths - Interpersonal violence - Sex: Both - Age: Under 5 (Number) 0 Deaths - Nutritional deficiencies - Sex: Both - Age: Under 5 (Number) 0 Deaths - Acute hepatitis - Sex: Both - Age: Under 5 (Number) 0 Deaths - Neoplasms - Sex: Both - Age: Under 5 (Number) 0 Deaths - Measles - Sex: Both - Age: Under 5 (Number) 0 Deaths - Digestive diseases - Sex: Both - Age: Under 5 (Number) 0 Deaths - Cirrhosis and other chronic liver diseases - Sex: Both - Age: Under 5 (Number) 0 Deaths - Chronic kidney disease - Sex: Both - Age: Under 5 (Number) 0 Deaths - Cardiovascular diseases - Sex: Both - Age: Under 5 (Number) 0 Deaths - Congenital birth defects - Sex: Both - Age: Under 5 (Number) 0 Deaths - Lower respiratory infections - Sex: Both - Age: Under 5 (Number) 0 Deaths - Neonatal preterm birth - Sex: Both - Age: Under 5 (Number) 0 Deaths - Environmental heat and cold exposure - Sex: Both - Age: Under 5 (Number) 0 Deaths - Neonatal sepsis and other neonatal infections - Sex: Both - Age: Under 5 (Number) 0 Deaths - Exposure to forces of nature - Sex: Both - Age: Under 5 (Number) 0 Deaths - Diabetes mellitus - Sex: Both - Age: Under 5 (Number) 0 Deaths - Neonatal encephalopathy due to birth asphyxia and trauma - Sex: Both - Age: Under 5 (Number) 0 Deaths - Meningitis - Sex: Both - Age: Under 5 (Number) 0 Deaths - Other neonatal disorders - Sex: Both - Age: Under 5 (Number) 0 Deaths - Whooping cough - Sex: Both - Age: Under 5 (Number) 0 Deaths - Diarrheal diseases - Sex: Both - Age: Under 5 (Number) 0 Deaths - Fire, heat, and hot substances - Sex: Both - Age: Under 5 (Number) 0 Deaths - Road injuries - Sex: Both - Age: Under 5 (Number) 0 Deaths - Tuberculosis - Sex: Both - Age: Under 5 (Number) 0 Deaths - HIV/AIDS - Sex: Both - Age: Under 5 (Number) 0 Deaths - Drowning - Sex: Both - Age: Under 5 (Number) 0 Deaths - Malaria - Sex: Both - Age: Under 5 (Number) 0 Deaths - Syphilis - Sex: Both - Age: Under 5 (Number) 0 dtype: int64
In [6]:
missing_code_entities_unique = df.loc[df['Code'].isnull(), 'Entity'].unique()
missing_code_entities_unique
Out[6]:
array(['African Region (WHO)', 'East Asia & Pacific (WB)',
'Eastern Mediterranean Region (WHO)', 'England',
'Europe & Central Asia (WB)', 'European Region (WHO)', 'G20',
'Latin America & Caribbean (WB)',
'Middle East & North Africa (WB)', 'North America (WB)',
'Northern Ireland', 'OECD Countries',
'Region of the Americas (WHO)', 'Scotland', 'South Asia (WB)',
'South-East Asia Region (WHO)', 'Sub-Saharan Africa (WB)', 'Wales',
'Western Pacific Region (WHO)', 'World Bank High Income',
'World Bank Low Income', 'World Bank Lower Middle Income',
'World Bank Upper Middle Income'], dtype=object)
In [7]:
df = df.dropna()
In [8]:
indices_to_drop = df[df['Entity'] == 'World'].index
df = df.drop(indices_to_drop)
In [9]:
df
Out[9]:
| Entity | Code | Year | Deaths - Invasive Non-typhoidal Salmonella (iNTS) - Sex: Both - Age: Under 5 (Number) | Deaths - Interpersonal violence - Sex: Both - Age: Under 5 (Number) | Deaths - Nutritional deficiencies - Sex: Both - Age: Under 5 (Number) | Deaths - Acute hepatitis - Sex: Both - Age: Under 5 (Number) | Deaths - Neoplasms - Sex: Both - Age: Under 5 (Number) | Deaths - Measles - Sex: Both - Age: Under 5 (Number) | Deaths - Digestive diseases - Sex: Both - Age: Under 5 (Number) | ... | Deaths - Other neonatal disorders - Sex: Both - Age: Under 5 (Number) | Deaths - Whooping cough - Sex: Both - Age: Under 5 (Number) | Deaths - Diarrheal diseases - Sex: Both - Age: Under 5 (Number) | Deaths - Fire, heat, and hot substances - Sex: Both - Age: Under 5 (Number) | Deaths - Road injuries - Sex: Both - Age: Under 5 (Number) | Deaths - Tuberculosis - Sex: Both - Age: Under 5 (Number) | Deaths - HIV/AIDS - Sex: Both - Age: Under 5 (Number) | Deaths - Drowning - Sex: Both - Age: Under 5 (Number) | Deaths - Malaria - Sex: Both - Age: Under 5 (Number) | Deaths - Syphilis - Sex: Both - Age: Under 5 (Number) | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Afghanistan | AFG | 1990 | 48 | 105 | 1779 | 718 | 431 | 8649 | 477 | ... | 7112 | 2455 | 3968 | 131 | 802 | 808 | 10 | 776 | 21 | 123 |
| 1 | Afghanistan | AFG | 1991 | 55 | 130 | 1822 | 741 | 439 | 8669 | 495 | ... | 7574 | 2385 | 4650 | 129 | 781 | 800 | 12 | 748 | 41 | 132 |
| 2 | Afghanistan | AFG | 1992 | 68 | 155 | 2069 | 836 | 486 | 8539 | 554 | ... | 8614 | 2370 | 5833 | 137 | 821 | 863 | 13 | 777 | 51 | 180 |
| 3 | Afghanistan | AFG | 1993 | 78 | 178 | 2427 | 970 | 549 | 8949 | 630 | ... | 9458 | 2659 | 7800 | 155 | 923 | 979 | 16 | 872 | 24 | 239 |
| 4 | Afghanistan | AFG | 1994 | 83 | 194 | 2649 | 1063 | 589 | 10642 | 681 | ... | 9823 | 3187 | 7894 | 170 | 1015 | 1064 | 19 | 961 | 52 | 259 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6835 | Zimbabwe | ZWE | 2015 | 106 | 31 | 1733 | 17 | 56 | 615 | 92 | ... | 2269 | 518 | 1345 | 114 | 115 | 799 | 2178 | 126 | 1475 | 399 |
| 6836 | Zimbabwe | ZWE | 2016 | 112 | 32 | 1771 | 18 | 58 | 369 | 95 | ... | 2249 | 559 | 1286 | 119 | 120 | 787 | 1827 | 133 | 1219 | 398 |
| 6837 | Zimbabwe | ZWE | 2017 | 111 | 32 | 1714 | 17 | 58 | 261 | 94 | ... | 2245 | 544 | 1248 | 117 | 119 | 745 | 1658 | 133 | 1249 | 394 |
| 6838 | Zimbabwe | ZWE | 2018 | 109 | 31 | 1639 | 16 | 58 | 340 | 91 | ... | 2203 | 568 | 1136 | 114 | 115 | 693 | 1458 | 129 | 1213 | 397 |
| 6839 | Zimbabwe | ZWE | 2019 | 108 | 31 | 1598 | 15 | 57 | 349 | 89 | ... | 2190 | 536 | 1067 | 112 | 112 | 661 | 1394 | 127 | 1207 | 413 |
6120 rows × 32 columns
In [10]:
rename_dict = {
'Deaths - Invasive Non-typhoidal Salmonella (iNTS) - Sex: Both - Age: Under 5 (Number)': 'INTS_Deaths',
'Deaths - Interpersonal violence - Sex: Both - Age: Under 5 (Number)': 'Violence_Deaths',
'Deaths - Nutritional deficiencies - Sex: Both - Age: Under 5 (Number)': 'Nutrition_Deaths',
'Deaths - Acute hepatitis - Sex: Both - Age: Under 5 (Number)': 'Hepatitis_Deaths',
'Deaths - Neoplasms - Sex: Both - Age: Under 5 (Number)': 'Neoplasms_Deaths',
'Deaths - Measles - Sex: Both - Age: Under 5 (Number)': 'Measles_Deaths',
'Deaths - Digestive diseases - Sex: Both - Age: Under 5 (Number)': 'Digestive_Deaths',
'Deaths - Cirrhosis and other chronic liver diseases - Sex: Both - Age: Under 5 (Number)': 'Cirrhosis_Deaths',
'Deaths - Chronic kidney disease - Sex: Both - Age: Under 5 (Number)': 'Kidney_Deaths',
'Deaths - Cardiovascular diseases - Sex: Both - Age: Under 5 (Number)': 'Cardiovascular_Deaths',
'Deaths - Congenital birth defects - Sex: Both - Age: Under 5 (Number)': 'Congenital_Deaths',
'Deaths - Lower respiratory infections - Sex: Both - Age: Under 5 (Number)': 'Respiratory_Deaths',
'Deaths - Neonatal preterm birth - Sex: Both - Age: Under 5 (Number)': 'Preterm_Deaths',
'Deaths - Environmental heat and cold exposure - Sex: Both - Age: Under 5 (Number)': 'Heat_Cold_Deaths',
'Deaths - Neonatal sepsis and other neonatal infections - Sex: Both - Age: Under 5 (Number)': 'Sepsis_Deaths',
'Deaths - Exposure to forces of nature - Sex: Both - Age: Under 5 (Number)': 'Nature_Deaths',
'Deaths - Diabetes mellitus - Sex: Both - Age: Under 5 (Number)': 'Diabetes_Deaths',
'Deaths - Neonatal encephalopathy due to birth asphyxia and trauma - Sex: Both - Age: Under 5 (Number)': 'Encephalopathy_Deaths',
'Deaths - Meningitis - Sex: Both - Age: Under 5 (Number)': 'Meningitis_Deaths',
'Deaths - Other neonatal disorders - Sex: Both - Age: Under 5 (Number)': 'Other_Neonatal_Deaths',
'Deaths - Whooping cough - Sex: Both - Age: Under 5 (Number)': 'Whooping_Cough_Deaths',
'Deaths - Diarrheal diseases - Sex: Both - Age: Under 5 (Number)': 'Diarrheal_Deaths',
'Deaths - Fire, heat, and hot substances - Sex: Both - Age: Under 5 (Number)': 'Fire_Heat_Deaths',
'Deaths - Road injuries - Sex: Both - Age: Under 5 (Number)': 'Road_Deaths',
'Deaths - Tuberculosis - Sex: Both - Age: Under 5 (Number)': 'Tuberculosis_Deaths',
'Deaths - HIV/AIDS - Sex: Both - Age: Under 5 (Number)': 'HIV_AIDS_Deaths',
'Deaths - Drowning - Sex: Both - Age: Under 5 (Number)': 'Drowning_Deaths',
'Deaths - Malaria - Sex: Both - Age: Under 5 (Number)': 'Malaria_Deaths',
'Deaths - Syphilis - Sex: Both - Age: Under 5 (Number)': 'Syphilis_Deaths'
}
df = df.rename(columns=rename_dict)
print(df.columns)
Index(['Entity', 'Code', 'Year', 'INTS_Deaths', 'Violence_Deaths',
'Nutrition_Deaths', 'Hepatitis_Deaths', 'Neoplasms_Deaths',
'Measles_Deaths', 'Digestive_Deaths', 'Cirrhosis_Deaths',
'Kidney_Deaths', 'Cardiovascular_Deaths', 'Congenital_Deaths',
'Respiratory_Deaths', 'Preterm_Deaths', 'Heat_Cold_Deaths',
'Sepsis_Deaths', 'Nature_Deaths', 'Diabetes_Deaths',
'Encephalopathy_Deaths', 'Meningitis_Deaths', 'Other_Neonatal_Deaths',
'Whooping_Cough_Deaths', 'Diarrheal_Deaths', 'Fire_Heat_Deaths',
'Road_Deaths', 'Tuberculosis_Deaths', 'HIV_AIDS_Deaths',
'Drowning_Deaths', 'Malaria_Deaths', 'Syphilis_Deaths'],
dtype='object')
In [11]:
cols = df.drop(columns=['Entity', 'Code', 'Year']).select_dtypes(include=np.number)
z_scores = cols.apply(zscore)
outlier_threshold = 3
outliers = (z_scores > outlier_threshold) | (z_scores < -outlier_threshold)
outlier_counts = outliers.sum()
print("Columns with outlier counts:")
print(outlier_counts[outlier_counts > 0])
Columns with outlier counts: INTS_Deaths 38 Violence_Deaths 78 Nutrition_Deaths 55 Hepatitis_Deaths 33 Neoplasms_Deaths 70 Measles_Deaths 74 Digestive_Deaths 102 Cirrhosis_Deaths 78 Kidney_Deaths 141 Cardiovascular_Deaths 96 Congenital_Deaths 85 Respiratory_Deaths 73 Preterm_Deaths 58 Heat_Cold_Deaths 43 Sepsis_Deaths 75 Nature_Deaths 13 Diabetes_Deaths 177 Encephalopathy_Deaths 106 Meningitis_Deaths 97 Other_Neonatal_Deaths 48 Whooping_Cough_Deaths 83 Diarrheal_Deaths 57 Fire_Heat_Deaths 81 Road_Deaths 133 Tuberculosis_Deaths 125 HIV_AIDS_Deaths 165 Drowning_Deaths 68 Malaria_Deaths 82 Syphilis_Deaths 136 dtype: int64
In [12]:
filtered_data = df[(df['Year'] >= 1990) & (df['Year'] <= 2019)]
aggregated_data = filtered_data.groupby('Entity').sum()
aggregated_data = aggregated_data.drop(columns=['Year'])
aggregated_data
C:\Users\tiles\AppData\Local\Temp\ipykernel_24540\2972503105.py:2: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
aggregated_data = filtered_data.groupby('Entity').sum()
Out[12]:
| INTS_Deaths | Violence_Deaths | Nutrition_Deaths | Hepatitis_Deaths | Neoplasms_Deaths | Measles_Deaths | Digestive_Deaths | Cirrhosis_Deaths | Kidney_Deaths | Cardiovascular_Deaths | ... | Other_Neonatal_Deaths | Whooping_Cough_Deaths | Diarrheal_Deaths | Fire_Heat_Deaths | Road_Deaths | Tuberculosis_Deaths | HIV_AIDS_Deaths | Drowning_Deaths | Malaria_Deaths | Syphilis_Deaths | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Entity | |||||||||||||||||||||
| Afghanistan | 4355 | 6307 | 58382 | 23437 | 17974 | 199342 | 20764 | 7046 | 8904 | 8038 | ... | 322226 | 107240 | 236890 | 4528 | 26172 | 23411 | 1185 | 25157 | 3310 | 8450 |
| Albania | 0 | 85 | 457 | 13 | 798 | 196 | 1357 | 127 | 143 | 1073 | ... | 7584 | 258 | 505 | 142 | 324 | 29 | 0 | 426 | 0 | 150 |
| Algeria | 422 | 392 | 3812 | 1110 | 4393 | 20391 | 5265 | 1884 | 2848 | 10669 | ... | 78084 | 8150 | 26856 | 3659 | 26803 | 959 | 1158 | 5173 | 0 | 4200 |
| American Samoa | 0 | 0 | 17 | 0 | 0 | 33 | 0 | 0 | 0 | 0 | ... | 46 | 22 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | 55 |
| Andorra | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| Venezuela | 0 | 2990 | 12437 | 129 | 4801 | 62 | 3363 | 439 | 716 | 1058 | ... | 9828 | 686 | 38038 | 756 | 5376 | 755 | 1534 | 3369 | 523 | 432 |
| Vietnam | 928 | 1055 | 9295 | 667 | 11434 | 84565 | 10139 | 1777 | 2849 | 12965 | ... | 27792 | 33658 | 38757 | 3021 | 9956 | 10835 | 2072 | 65895 | 1645 | 9328 |
| Yemen | 1924 | 1201 | 52547 | 6388 | 8405 | 100086 | 10464 | 3756 | 3108 | 19717 | ... | 252675 | 32201 | 397881 | 7836 | 54648 | 5338 | 1759 | 12282 | 7081 | 10606 |
| Zambia | 1503 | 2487 | 68820 | 1015 | 13625 | 57746 | 8670 | 1752 | 2269 | 5779 | ... | 59476 | 27087 | 230238 | 4001 | 8373 | 25753 | 216174 | 6805 | 124161 | 38809 |
| Zimbabwe | 2005 | 771 | 33936 | 334 | 1183 | 28844 | 2051 | 194 | 531 | 3908 | ... | 62915 | 12635 | 47178 | 2267 | 2200 | 17075 | 236368 | 2297 | 56942 | 11729 |
204 rows × 29 columns
In [13]:
total_deaths_per_entity = df.groupby('Entity').sum().sum(axis=1)
top_50_countries = total_deaths_per_entity.sort_values(ascending=False).head(50)
colors = plt.cm.cool(np.linspace(0, 1, len(top_50_countries)))
plt.figure(figsize=(12, 8))
top_50_countries.plot(kind='bar', color=colors)
plt.title('Top 50 Countries by Total Number of Deaths')
plt.xlabel('Country')
plt.ylabel('Total Number of Deaths')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()
C:\Users\tiles\AppData\Local\Temp\ipykernel_24540\11496973.py:1: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
total_deaths_per_entity = df.groupby('Entity').sum().sum(axis=1)
In [14]:
causes_of_death = df.drop(columns=['Entity', 'Code', 'Year']).sum()
top_20_causes = causes_of_death.sort_values(ascending=False).head(20)
colors = plt.cm.cool(np.linspace(0, 1, len(top_20_causes)))
plt.figure(figsize=(12, 8))
top_20_causes.plot(kind='bar', color=colors)
plt.title('Top 20 Causes of Death')
plt.xlabel('Cause of Death')
plt.ylabel('Total Number of Deaths')
plt.xticks(rotation=90)
plt.tight_layout()
plt.show()
In [15]:
df_map = df.copy()
df_map['Total_Deaths'] = df_map.drop(columns=['Entity', 'Code', 'Year']).sum(axis=1)
yearly_deaths = df_map.groupby(['Entity', 'Year'])['Total_Deaths'].sum().reset_index()
fig = px.choropleth(yearly_deaths,
locations="Entity",
locationmode='country names',
color="Total_Deaths",
hover_name="Entity",
animation_frame="Year",
color_continuous_scale='Reds',
title="Total Deaths from 1990 to 2019")
fig.update_layout(
geo=dict(showframe=False, showcoastlines=False, projection_type='equirectangular'),
title=dict(x=0.5)
)
fig.show()
In [16]:
import matplotlib.pyplot as plt
df_cause = df.drop(columns=['Entity', 'Code'])
df_yearly = df_cause.groupby('Year').sum().reset_index()
if 'Total_Deaths' in df_yearly.columns:
df_yearly = df_yearly.drop(columns='Total_Deaths')
num_causes = len(df_yearly.columns) - 1
colors = plt.cm.tab10(np.linspace(0, 1, num_causes))
plt.figure(figsize=(12, 8))
for i, c in enumerate(df_yearly.columns[1:], start=1):
plt.plot(df_yearly['Year'], df_yearly[c], label=c, color=colors[i % num_causes])
plt.title('Deaths by Disease from 1990 to 2019')
plt.xlabel('Year')
plt.ylabel('Number of Deaths')
plt.legend(loc='upper left', bbox_to_anchor=(1, 1), ncol=1)
plt.tight_layout()
plt.show()
Models¶
In [17]:
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df.drop(columns=['Entity', 'Code', 'Year']))
pca = PCA()
pca_out = pca.fit_transform(df_scaled)
In [18]:
print(pd.DataFrame({'Center': scaler.mean_, 'Scale': scaler.scale_}, index=df.columns.drop(['Entity', 'Code', 'Year'])))
Center Scale INTS_Deaths 231.492320 1622.368214 Violence_Deaths 79.825817 303.457690 Nutrition_Deaths 1333.719118 6972.497881 Hepatitis_Deaths 160.559967 1616.560333 Neoplasms_Deaths 297.492974 1204.333079 Measles_Deaths 1814.687092 8421.696247 Digestive_Deaths 253.901797 938.604887 Cirrhosis_Deaths 54.085621 238.815144 Kidney_Deaths 63.699020 180.199557 Cardiovascular_Deaths 236.582190 898.611133 Congenital_Deaths 3175.634477 10591.726971 Respiratory_Deaths 6722.154902 30161.351947 Preterm_Deaths 4808.332026 20841.947352 Heat_Cold_Deaths 17.039706 123.145252 Sepsis_Deaths 1271.867157 4650.791920 Nature_Deaths 24.220098 505.365350 Diabetes_Deaths 16.040850 44.805750 Encephalopathy_Deaths 3647.752614 15066.581181 Meningitis_Deaths 1019.708007 4083.534715 Other_Neonatal_Deaths 2374.598366 14282.461832 Whooping_Cough_Deaths 807.175163 3630.187080 Diarrheal_Deaths 5213.969771 22619.833294 Fire_Heat_Deaths 111.063399 358.777053 Road_Deaths 352.145588 1244.571219 Tuberculosis_Deaths 611.118464 2500.691503 HIV_AIDS_Deaths 707.966013 2634.357974 Drowning_Deaths 469.680556 2781.121195 Malaria_Deaths 2654.809150 12169.034911 Syphilis_Deaths 448.185294 1475.692709
In [19]:
print("Number of Principal Components:", pca.n_components_)
Number of Principal Components: 29
In [20]:
components_df = pd.DataFrame(pca.components_.T, index=df.columns.drop(['Entity', 'Code', 'Year']), columns=[f'PC{i+1}' for i in range(pca.n_components_)])
print(components_df)
PC1 PC2 PC3 PC4 PC5 \
INTS_Deaths 0.094227 -0.279744 0.463189 -0.097988 -0.311449
Violence_Deaths 0.192083 0.222668 0.146339 -0.055789 -0.035394
Nutrition_Deaths 0.185846 -0.098552 -0.229165 0.030813 0.090948
Hepatitis_Deaths 0.181262 -0.071792 -0.355795 -0.059059 -0.015808
Neoplasms_Deaths 0.189168 0.308176 0.059227 -0.001661 0.086901
Measles_Deaths 0.190922 -0.195315 -0.058128 0.001188 -0.004194
Digestive_Deaths 0.204316 0.154755 0.155294 -0.032092 -0.091682
Cirrhosis_Deaths 0.207761 -0.142255 -0.120225 -0.020686 -0.116311
Kidney_Deaths 0.203901 0.060996 0.144088 0.068227 -0.049469
Cardiovascular_Deaths 0.153002 0.310854 0.170639 -0.037460 -0.036617
Congenital_Deaths 0.210769 0.182265 0.024624 -0.018932 -0.016313
Respiratory_Deaths 0.221859 0.003205 -0.048193 -0.041515 -0.054279
Preterm_Deaths 0.212775 -0.004188 -0.175389 -0.018307 -0.026728
Heat_Cold_Deaths 0.201447 0.150567 -0.138287 -0.088340 -0.017715
Sepsis_Deaths 0.204937 -0.181966 -0.100790 0.021038 0.033891
Nature_Deaths 0.021530 0.025108 -0.033360 0.933804 -0.320722
Diabetes_Deaths 0.151425 0.185858 0.211426 0.150184 0.291552
Encephalopathy_Deaths 0.205653 -0.058167 -0.058967 0.010093 0.001525
Meningitis_Deaths 0.200081 -0.191567 0.151517 -0.053601 -0.146360
Other_Neonatal_Deaths 0.186166 -0.095823 -0.326120 -0.005729 0.037272
Whooping_Cough_Deaths 0.216636 -0.095771 -0.121720 -0.004722 0.002849
Diarrheal_Deaths 0.204768 -0.223387 0.033048 -0.042289 -0.123416
Fire_Heat_Deaths 0.218313 0.060347 0.051362 -0.028369 -0.029486
Road_Deaths 0.182034 0.251374 0.167682 -0.015125 -0.015366
Tuberculosis_Deaths 0.205832 -0.118289 -0.014162 0.037863 0.044409
HIV_AIDS_Deaths 0.067126 -0.195497 0.223282 0.218046 0.743423
Drowning_Deaths 0.178385 0.319090 -0.047663 0.013700 0.006005
Malaria_Deaths 0.129861 -0.318591 0.386044 -0.031780 -0.070264
Syphilis_Deaths 0.193933 -0.157282 -0.001519 0.118707 0.260574
PC6 PC7 PC8 PC9 PC10 ... \
INTS_Deaths -0.150491 -0.160460 -0.168650 0.019196 0.068765 ...
Violence_Deaths -0.211198 -0.073727 -0.162915 -0.251523 0.142197 ...
Nutrition_Deaths 0.162417 0.419810 -0.199878 -0.001383 0.167245 ...
Hepatitis_Deaths -0.181882 -0.073030 -0.109682 0.225578 0.130812 ...
Neoplasms_Deaths -0.085367 0.004181 0.014093 -0.241094 -0.110874 ...
Measles_Deaths 0.131766 0.354851 0.323575 -0.326737 0.191139 ...
Digestive_Deaths -0.102466 -0.013309 -0.087361 -0.031306 -0.249631 ...
Cirrhosis_Deaths -0.004083 -0.084571 -0.035769 0.175154 -0.093034 ...
Kidney_Deaths 0.238568 -0.130608 0.174932 -0.108956 -0.115535 ...
Cardiovascular_Deaths -0.069920 0.144762 0.567575 0.467288 0.356480 ...
Congenital_Deaths -0.090481 -0.163793 -0.011526 0.023456 -0.043899 ...
Respiratory_Deaths -0.076192 0.086761 -0.050587 -0.051147 0.123885 ...
Preterm_Deaths -0.084886 -0.221464 0.010628 0.092853 -0.044653 ...
Heat_Cold_Deaths -0.275361 0.125037 -0.239183 0.012007 0.136205 ...
Sepsis_Deaths 0.034137 -0.287823 -0.004858 0.011011 0.111145 ...
Nature_Deaths -0.124072 0.033323 -0.017946 0.017272 0.030310 ...
Diabetes_Deaths 0.524986 -0.306077 -0.212995 0.026004 0.468122 ...
Encephalopathy_Deaths 0.027057 -0.265309 0.344594 -0.302319 -0.190070 ...
Meningitis_Deaths -0.045498 -0.004897 0.197662 -0.228545 0.005492 ...
Other_Neonatal_Deaths -0.070486 -0.265279 0.060426 0.175507 0.013000 ...
Whooping_Cough_Deaths -0.000840 0.023613 -0.063161 0.040207 0.011448 ...
Diarrheal_Deaths -0.003502 0.130329 0.078196 -0.001123 0.121607 ...
Fire_Heat_Deaths -0.076153 0.056770 -0.007674 -0.077277 0.032710 ...
Road_Deaths 0.066890 0.188621 0.020785 0.382679 -0.417345 ...
Tuberculosis_Deaths 0.300646 0.298881 -0.041852 0.027531 -0.235244 ...
HIV_AIDS_Deaths -0.472547 0.074565 0.070075 -0.014049 -0.019501 ...
Drowning_Deaths 0.021078 0.156610 -0.273636 -0.205253 -0.059690 ...
Malaria_Deaths 0.005307 0.110629 -0.258285 0.230162 0.052138 ...
Syphilis_Deaths 0.234327 -0.102497 -0.021082 0.145453 -0.333198 ...
PC20 PC21 PC22 PC23 PC24 \
INTS_Deaths 0.097433 0.001448 -0.074810 -0.094545 -0.097994
Violence_Deaths -0.173820 -0.033710 -0.050523 0.083480 0.103140
Nutrition_Deaths -0.127458 -0.044478 0.009286 0.094013 -0.186105
Hepatitis_Deaths -0.236619 -0.216256 -0.077713 0.303019 0.177520
Neoplasms_Deaths -0.092602 0.080114 -0.037015 -0.206970 0.142890
Measles_Deaths 0.053259 -0.115713 -0.123097 -0.015506 -0.142276
Digestive_Deaths -0.055170 -0.379395 0.051284 -0.150414 -0.219953
Cirrhosis_Deaths -0.092435 -0.357556 -0.195848 -0.003489 0.239556
Kidney_Deaths -0.321186 0.343685 -0.127534 -0.012511 -0.199787
Cardiovascular_Deaths 0.055162 -0.101249 0.098490 -0.050023 0.073617
Congenital_Deaths -0.166603 0.104662 0.302517 -0.228667 0.095590
Respiratory_Deaths -0.110788 0.086398 -0.053321 -0.144675 -0.227351
Preterm_Deaths 0.108676 -0.137703 0.254089 -0.071234 -0.148213
Heat_Cold_Deaths 0.035002 0.207158 -0.148331 -0.029819 -0.384965
Sepsis_Deaths 0.647155 0.229967 -0.002104 0.026460 -0.036458
Nature_Deaths 0.001507 -0.001788 0.001029 0.002632 -0.003212
Diabetes_Deaths -0.042636 -0.117215 -0.058150 0.066593 -0.010476
Encephalopathy_Deaths 0.009330 -0.222256 -0.415252 0.056801 0.066756
Meningitis_Deaths -0.115091 0.092793 0.559466 0.541645 -0.002258
Other_Neonatal_Deaths -0.085579 0.118892 0.098032 -0.148898 -0.244146
Whooping_Cough_Deaths -0.121116 0.456805 -0.059503 -0.087762 0.564668
Diarrheal_Deaths 0.047733 0.118182 -0.224383 -0.040080 0.085454
Fire_Heat_Deaths 0.321166 -0.138664 0.057472 -0.110696 0.131770
Road_Deaths 0.150643 0.216489 -0.267263 0.347573 -0.100635
Tuberculosis_Deaths 0.081210 -0.069895 0.275319 -0.421664 0.135779
HIV_AIDS_Deaths 0.004064 -0.001969 -0.019463 0.000040 0.015408
Drowning_Deaths 0.311309 -0.091829 0.079973 0.269858 0.182744
Malaria_Deaths -0.119786 -0.071422 -0.039554 -0.015174 0.069500
Syphilis_Deaths -0.061982 -0.062689 0.100664 0.136883 -0.121650
PC25 PC26 PC27 PC28 PC29
INTS_Deaths -0.088373 0.044640 -0.002440 -0.108274 -0.045844
Violence_Deaths 0.057892 0.050430 0.056767 0.021887 0.028849
Nutrition_Deaths 0.060651 -0.031770 -0.038811 -0.111907 -0.029652
Hepatitis_Deaths -0.400324 -0.138143 -0.279816 -0.366895 0.054902
Neoplasms_Deaths -0.388595 0.351079 0.316007 -0.234096 0.146334
Measles_Deaths 0.086526 0.055086 0.052071 0.008244 -0.066508
Digestive_Deaths 0.307334 0.024007 -0.262471 -0.292995 -0.095235
Cirrhosis_Deaths -0.147981 0.235191 0.326707 0.486375 -0.074721
Kidney_Deaths -0.228333 -0.123653 0.026343 -0.201089 -0.040781
Cardiovascular_Deaths -0.002784 -0.016565 0.019231 0.005018 -0.027578
Congenital_Deaths -0.059441 -0.270984 -0.333200 0.332474 -0.250917
Respiratory_Deaths 0.009130 0.014471 -0.242095 0.348853 0.754397
Preterm_Deaths 0.220018 -0.408043 0.585638 -0.214154 0.257328
Heat_Cold_Deaths -0.105954 -0.203076 0.198300 0.237287 -0.404382
Sepsis_Deaths -0.236459 -0.000124 -0.138853 -0.011869 0.045211
Nature_Deaths -0.004240 0.002690 -0.001426 0.004114 -0.000293
Diabetes_Deaths 0.077620 0.030078 0.022936 0.017752 -0.040112
Encephalopathy_Deaths 0.131748 -0.218342 -0.136616 0.111867 -0.068003
Meningitis_Deaths -0.029798 0.122174 0.048276 0.108004 -0.035973
Other_Neonatal_Deaths 0.265599 0.635336 -0.080919 -0.078857 -0.164563
Whooping_Cough_Deaths 0.440254 -0.084923 0.022981 -0.074209 0.032438
Diarrheal_Deaths 0.022970 -0.035097 0.047334 -0.171974 -0.151255
Fire_Heat_Deaths -0.004262 0.065565 -0.176340 -0.107244 0.071756
Road_Deaths 0.066116 0.072285 -0.019305 0.017310 0.088178
Tuberculosis_Deaths -0.252701 -0.057748 -0.033566 -0.002233 -0.090820
HIV_AIDS_Deaths -0.001889 -0.011943 -0.000415 0.002573 -0.014906
Drowning_Deaths 0.118622 0.073175 -0.012029 0.047145 -0.063032
Malaria_Deaths 0.074032 0.002485 0.052472 0.004714 0.040699
Syphilis_Deaths -0.041984 -0.047687 0.012801 0.092254 0.028923
[29 rows x 29 columns]
In [24]:
plt.figure(figsize=(14, 10))
for i in range(28):
plt.scatter(pca_out[:, i], pca_out[:, i + 1], label=f'PC{i+1} vs PC{i+2}', s=5)
plt.xlabel('Principal Component')
plt.ylabel('Principal Component')
plt.title('Scatter Plot of the First 29 Principal Components')
plt.legend()
plt.show()
In [25]:
print("Explained Variance:", pca.explained_variance_)
Explained Variance: [1.97196643e+01 2.45048380e+00 2.19999567e+00 1.00752567e+00 9.58763219e-01 5.18799377e-01 4.43362442e-01 3.55857051e-01 3.02347592e-01 2.21220670e-01 2.04948758e-01 1.25738712e-01 1.04572661e-01 8.55214733e-02 5.88939451e-02 4.80105053e-02 4.28070082e-02 3.57672799e-02 2.20169171e-02 2.13706380e-02 1.66299190e-02 1.26423574e-02 1.15981840e-02 9.93009277e-03 7.62041773e-03 5.67412152e-03 5.28917665e-03 4.89361606e-03 2.79377675e-03]
In [26]:
print("Explained Variance Ratio:", pca.explained_variance_ratio_)
Explained Variance Ratio: [6.79877314e-01 8.44856343e-02 7.58495239e-02 3.47365877e-02 3.30553985e-02 1.78867106e-02 1.52858620e-02 1.22689277e-02 1.04240755e-02 7.62705251e-03 7.06604378e-03 4.33510917e-03 3.60536462e-03 2.94853446e-03 2.03049386e-03 1.65526415e-03 1.47586254e-03 1.23315295e-03 7.59079984e-04 7.36798139e-04 5.73351781e-04 4.35872127e-04 3.99872030e-04 3.42361042e-04 2.62730088e-04 1.95627392e-04 1.82355600e-04 1.68717809e-04 9.63213881e-05]
In [27]:
plt.figure(figsize=(10, 5))
plt.plot(range(1, len(pca.explained_variance_ratio_) + 1), pca.explained_variance_ratio_, marker='o')
plt.xlabel('Principal Component')
plt.ylabel('Proportion of Variance Explained')
plt.title('Proportion of Variance Explained by Principal Components')
plt.grid(True)
plt.show()
In [28]:
cumulative_explained_variance = np.cumsum(pca.explained_variance_ratio_)
plt.figure(figsize=(10, 5))
plt.plot(range(1, len(cumulative_explained_variance) + 1), cumulative_explained_variance, marker='o')
plt.xlabel('Number of Principal Components')
plt.ylabel('Cumulative Proportion of Variance Explained')
plt.title('Cumulative Proportion of Variance Explained by Principal Components')
plt.grid(True)
plt.show()
In [29]:
pca = PCA(n_components=7)
pca.fit(df_scaled)
components_df = pd.DataFrame(pca.components_.T, index=df.columns.drop(['Entity', 'Code', 'Year']), columns=[f'PC{i+1}' for i in range(pca.n_components_)])
print(components_df)
pca_out = pca.transform(df_scaled)
PC1 PC2 PC3 PC4 PC5 \
INTS_Deaths 0.094227 -0.279744 0.463189 -0.097988 -0.311449
Violence_Deaths 0.192083 0.222668 0.146339 -0.055789 -0.035394
Nutrition_Deaths 0.185846 -0.098552 -0.229165 0.030813 0.090948
Hepatitis_Deaths 0.181262 -0.071792 -0.355795 -0.059059 -0.015808
Neoplasms_Deaths 0.189168 0.308176 0.059227 -0.001661 0.086901
Measles_Deaths 0.190922 -0.195315 -0.058128 0.001188 -0.004194
Digestive_Deaths 0.204316 0.154755 0.155294 -0.032092 -0.091682
Cirrhosis_Deaths 0.207761 -0.142255 -0.120225 -0.020686 -0.116311
Kidney_Deaths 0.203901 0.060996 0.144088 0.068227 -0.049469
Cardiovascular_Deaths 0.153002 0.310854 0.170639 -0.037460 -0.036617
Congenital_Deaths 0.210769 0.182265 0.024624 -0.018932 -0.016313
Respiratory_Deaths 0.221859 0.003205 -0.048193 -0.041515 -0.054279
Preterm_Deaths 0.212775 -0.004188 -0.175389 -0.018307 -0.026728
Heat_Cold_Deaths 0.201447 0.150567 -0.138287 -0.088340 -0.017715
Sepsis_Deaths 0.204937 -0.181966 -0.100790 0.021038 0.033891
Nature_Deaths 0.021530 0.025108 -0.033360 0.933804 -0.320722
Diabetes_Deaths 0.151425 0.185858 0.211426 0.150184 0.291552
Encephalopathy_Deaths 0.205653 -0.058167 -0.058967 0.010093 0.001525
Meningitis_Deaths 0.200081 -0.191567 0.151517 -0.053601 -0.146360
Other_Neonatal_Deaths 0.186166 -0.095823 -0.326120 -0.005729 0.037272
Whooping_Cough_Deaths 0.216636 -0.095771 -0.121720 -0.004722 0.002849
Diarrheal_Deaths 0.204768 -0.223387 0.033048 -0.042289 -0.123416
Fire_Heat_Deaths 0.218313 0.060347 0.051362 -0.028369 -0.029486
Road_Deaths 0.182034 0.251374 0.167682 -0.015125 -0.015366
Tuberculosis_Deaths 0.205832 -0.118289 -0.014162 0.037863 0.044409
HIV_AIDS_Deaths 0.067126 -0.195497 0.223282 0.218046 0.743423
Drowning_Deaths 0.178385 0.319090 -0.047663 0.013700 0.006005
Malaria_Deaths 0.129861 -0.318591 0.386044 -0.031780 -0.070264
Syphilis_Deaths 0.193933 -0.157282 -0.001519 0.118707 0.260574
PC6 PC7
INTS_Deaths -0.150491 -0.160460
Violence_Deaths -0.211198 -0.073727
Nutrition_Deaths 0.162417 0.419810
Hepatitis_Deaths -0.181882 -0.073030
Neoplasms_Deaths -0.085367 0.004181
Measles_Deaths 0.131766 0.354851
Digestive_Deaths -0.102466 -0.013309
Cirrhosis_Deaths -0.004083 -0.084571
Kidney_Deaths 0.238568 -0.130608
Cardiovascular_Deaths -0.069920 0.144762
Congenital_Deaths -0.090481 -0.163793
Respiratory_Deaths -0.076192 0.086761
Preterm_Deaths -0.084886 -0.221464
Heat_Cold_Deaths -0.275361 0.125037
Sepsis_Deaths 0.034137 -0.287823
Nature_Deaths -0.124072 0.033323
Diabetes_Deaths 0.524986 -0.306077
Encephalopathy_Deaths 0.027057 -0.265309
Meningitis_Deaths -0.045498 -0.004897
Other_Neonatal_Deaths -0.070486 -0.265279
Whooping_Cough_Deaths -0.000840 0.023613
Diarrheal_Deaths -0.003502 0.130329
Fire_Heat_Deaths -0.076153 0.056770
Road_Deaths 0.066890 0.188621
Tuberculosis_Deaths 0.300646 0.298881
HIV_AIDS_Deaths -0.472547 0.074565
Drowning_Deaths 0.021078 0.156610
Malaria_Deaths 0.005307 0.110629
Syphilis_Deaths 0.234327 -0.102497
In [30]:
plt.figure(figsize=(14, 10))
for i in range(6):
plt.scatter(pca_out[:, i], pca_out[:, i + 1], label=f'PC{i+1} vs PC{i+2}', s=5)
plt.xlabel('Principal Component')
plt.ylabel('Principal Component')
plt.title('Scatter Plot of the First Seven Principal Components')
plt.legend()
plt.show()
SVD¶
In [31]:
U, s, V = np.linalg.svd(df_scaled, full_matrices=False)
In [32]:
np.round(V.T, 3)
Out[32]:
array([[ 0.094, 0.28 , -0.463, 0.098, -0.311, 0.15 , -0.16 , 0.169,
-0.019, 0.069, -0.101, -0.211, 0.041, 0.095, 0.002, -0.224,
-0.349, 0.434, 0.163, 0.097, 0.001, 0.075, 0.095, -0.098,
-0.088, -0.045, -0.002, -0.108, -0.046],
[ 0.192, -0.223, -0.146, 0.056, -0.035, 0.211, -0.074, 0.163,
0.252, 0.142, 0.175, 0.048, 0.298, 0.036, -0.169, -0.378,
0.593, 0.092, 0.103, -0.174, -0.034, 0.051, -0.083, 0.103,
0.058, -0.05 , 0.057, 0.022, 0.029],
[ 0.186, 0.099, 0.229, -0.031, 0.091, -0.162, 0.42 , 0.2 ,
0.001, 0.167, -0.187, -0.241, 0.592, -0.232, -0.066, 0.116,
-0.133, 0.08 , 0.055, -0.127, -0.044, -0.009, -0.094, -0.186,
0.061, 0.032, -0.039, -0.112, -0.03 ],
[ 0.181, 0.072, 0.356, 0.059, -0.016, 0.182, -0.073, 0.11 ,
-0.226, 0.131, 0.092, 0.08 , -0.225, 0.034, 0.019, -0.044,
-0.038, 0.085, 0.091, -0.237, -0.216, 0.078, -0.303, 0.178,
-0.4 , 0.138, -0.28 , -0.367, 0.055],
[ 0.189, -0.308, -0.059, 0.002, 0.087, 0.085, 0.004, -0.014,
0.241, -0.111, 0.198, -0.09 , 0.089, 0.04 , 0.02 , 0.39 ,
-0.174, 0.079, -0.068, -0.093, 0.08 , 0.037, 0.207, 0.143,
-0.389, -0.351, 0.316, -0.234, 0.146],
[ 0.191, 0.195, 0.058, -0.001, -0.004, -0.132, 0.355, -0.324,
0.327, 0.191, 0.114, 0.356, -0.248, 0.188, -0.295, 0.098,
-0.022, 0.231, 0.272, 0.053, -0.116, 0.123, 0.016, -0.142,
0.087, -0.055, 0.052, 0.008, -0.067],
[ 0.204, -0.155, -0.155, 0.032, -0.092, 0.102, -0.013, 0.087,
0.031, -0.25 , -0.181, 0.234, 0.07 , 0.229, 0.338, 0.249,
0.098, -0.091, 0.028, -0.055, -0.379, -0.051, 0.15 , -0.22 ,
0.307, -0.024, -0.262, -0.293, -0.095],
[ 0.208, 0.142, 0.12 , 0.021, -0.116, 0.004, -0.085, 0.036,
-0.175, -0.093, -0.353, 0.221, 0.12 , 0.015, 0.054, -0.016,
-0.04 , -0.082, 0.05 , -0.092, -0.358, 0.196, 0.003, 0.24 ,
-0.148, -0.235, 0.327, 0.486, -0.075],
[ 0.204, -0.061, -0.144, -0.068, -0.049, -0.239, -0.131, -0.175,
0.109, -0.116, -0.498, 0.007, -0.13 , -0.074, -0.144, -0.133,
0.078, -0.278, 0.13 , -0.321, 0.344, 0.128, 0.013, -0.2 ,
-0.228, 0.124, 0.026, -0.201, -0.041],
[ 0.153, -0.311, -0.171, 0.037, -0.037, 0.07 , 0.145, -0.568,
-0.467, 0.356, -0.084, -0.287, 0.042, 0.126, 0.047, 0.018,
0.094, 0.004, 0.063, 0.055, -0.101, -0.098, 0.05 , 0.074,
-0.003, 0.017, 0.019, 0.005, -0.028],
[ 0.211, -0.182, -0.025, 0.019, -0.016, 0.09 , -0.164, 0.012,
-0.023, -0.044, 0.002, 0.185, 0.062, -0.152, -0.348, 0.228,
-0.129, 0.251, -0.128, -0.167, 0.105, -0.303, 0.229, 0.096,
-0.059, 0.271, -0.333, 0.332, -0.251],
[ 0.222, -0.003, 0.048, 0.042, -0.054, 0.076, 0.087, 0.051,
0.051, 0.124, -0.053, -0.044, -0.138, 0.042, 0.152, -0.023,
-0.017, 0.014, -0.123, -0.111, 0.086, 0.053, 0.145, -0.227,
0.009, -0.014, -0.242, 0.349, 0.754],
[ 0.213, 0.004, 0.175, 0.018, -0.027, 0.085, -0.221, -0.011,
-0.093, -0.045, -0.005, 0.081, -0.052, -0.175, -0.11 , -0.038,
-0.032, 0.092, -0.024, 0.109, -0.138, -0.254, 0.071, -0.148,
0.22 , 0.408, 0.586, -0.214, 0.257],
[ 0.201, -0.151, 0.138, 0.088, -0.018, 0.275, 0.125, 0.239,
-0.012, 0.136, 0.209, -0.082, -0.139, 0.169, 0.227, -0.016,
-0.089, -0.229, 0.069, 0.035, 0.207, 0.148, 0.03 , -0.385,
-0.106, 0.203, 0.198, 0.237, -0.404],
[ 0.205, 0.182, 0.101, -0.021, 0.034, -0.034, -0.288, 0.005,
-0.011, 0.111, -0.103, 0.094, 0.205, 0.031, 0.066, 0.277,
0.31 , -0.008, 0.16 , 0.647, 0.23 , 0.002, -0.026, -0.036,
-0.236, 0. , -0.139, -0.012, 0.045],
[ 0.022, -0.025, 0.033, -0.934, -0.321, 0.124, 0.033, 0.018,
-0.017, 0.03 , 0.068, -0.001, 0.003, 0.003, 0.008, 0.007,
0.001, 0.002, 0.001, 0.002, -0.002, -0.001, -0.003, -0.003,
-0.004, -0.003, -0.001, 0.004, -0. ],
[ 0.151, -0.186, -0.211, -0.15 , 0.292, -0.525, -0.306, 0.213,
-0.026, 0.468, 0.123, 0.166, -0.075, -0.054, 0.196, -0.039,
-0.166, 0.035, -0.055, -0.043, -0.117, 0.058, -0.067, -0.01 ,
0.078, -0.03 , 0.023, 0.018, -0.04 ],
[ 0.206, 0.058, 0.059, -0.01 , 0.002, -0.027, -0.265, -0.345,
0.302, -0.19 , 0.221, -0.387, 0.035, -0.277, 0.139, 0.052,
-0.053, -0.012, 0.033, 0.009, -0.222, 0.415, -0.057, 0.067,
0.132, 0.218, -0.137, 0.112, -0.068],
[ 0.2 , 0.192, -0.152, 0.054, -0.146, 0.045, -0.005, -0.198,
0.229, 0.005, 0.087, 0.047, 0.034, -0.07 , 0.285, 0.027,
-0.092, -0.12 , -0.05 , -0.115, 0.093, -0.559, -0.542, -0.002,
-0.03 , -0.122, 0.048, 0.108, -0.036],
[ 0.186, 0.096, 0.326, 0.006, 0.037, 0.07 , -0.265, -0.06 ,
-0.176, 0.013, 0.068, -0.127, -0.155, -0.189, -0.11 , -0.122,
0.044, 0.046, -0.005, -0.086, 0.119, -0.098, 0.149, -0.244,
0.266, -0.635, -0.081, -0.079, -0.165],
[ 0.217, 0.096, 0.122, 0.005, 0.003, 0.001, 0.024, 0.063,
-0.04 , 0.011, -0.015, 0.01 , 0.025, 0.24 , 0.17 , 0.033,
-0.133, -0.022, 0.234, -0.121, 0.457, 0.06 , 0.088, 0.565,
0.44 , 0.085, 0.023, -0.074, 0.032],
[ 0.205, 0.223, -0.033, 0.042, -0.123, 0.004, 0.13 , -0.078,
0.001, 0.122, -0.045, 0.106, -0.004, 0.025, 0.051, -0.047,
0.128, 0.072, -0.825, 0.048, 0.118, 0.224, 0.04 , 0.085,
0.023, 0.035, 0.047, -0.172, -0.151],
[ 0.218, -0.06 , -0.051, 0.028, -0.029, 0.076, 0.057, 0.008,
0.077, 0.033, 0.111, 0.094, 0.125, -0.008, -0.314, -0.326,
-0.379, -0.577, -0.048, 0.321, -0.139, -0.057, 0.111, 0.132,
-0.004, -0.066, -0.176, -0.107, 0.072],
[ 0.182, -0.251, -0.168, 0.015, -0.015, -0.067, 0.189, -0.021,
-0.383, -0.417, 0.194, 0.287, 0.052, -0.233, -0.011, -0.081,
-0.078, 0.18 , 0.046, 0.151, 0.216, 0.267, -0.348, -0.101,
0.066, -0.072, -0.019, 0.017, 0.088],
[ 0.206, 0.118, 0.014, -0.038, 0.044, -0.301, 0.299, 0.042,
-0.028, -0.235, 0.15 , -0.06 , -0.135, -0.184, 0.319, -0.323,
0.197, 0.092, 0.102, 0.081, -0.07 , -0.275, 0.422, 0.136,
-0.253, 0.058, -0.034, -0.002, -0.091],
[ 0.067, 0.195, -0.223, -0.218, 0.743, 0.473, 0.075, -0.07 ,
0.014, -0.02 , -0.186, 0.068, -0.052, -0.122, 0.071, -0.082,
-0.038, 0.044, 0.006, 0.004, -0.002, 0.019, -0. , 0.015,
-0.002, 0.012, -0. , 0.003, -0.015],
[ 0.178, -0.319, 0.048, -0.014, 0.006, -0.021, 0.157, 0.274,
0.205, -0.06 , -0.372, -0.329, -0.427, -0.02 , -0.13 , 0.023,
0.043, 0.142, -0.076, 0.311, -0.092, -0.08 , -0.27 , 0.183,
0.119, -0.073, -0.012, 0.047, -0.063],
[ 0.13 , 0.319, -0.386, 0.032, -0.07 , -0.005, 0.111, 0.258,
-0.23 , 0.052, 0.204, -0.155, -0.227, -0.227, -0.245, 0.411,
0.244, -0.304, 0.081, -0.12 , -0.071, 0.04 , 0.015, 0.07 ,
0.074, -0.002, 0.052, 0.005, 0.041],
[ 0.194, 0.157, 0.002, -0.119, 0.261, -0.234, -0.102, 0.021,
-0.145, -0.333, 0.155, -0.267, 0.091, 0.637, -0.249, -0.046,
0.032, 0.022, -0.135, -0.062, -0.063, -0.101, -0.137, -0.122,
-0.042, 0.048, 0.013, 0.092, 0.029]])
In [33]:
pca_out = pca.transform(df_scaled)
pca_out
Out[33]:
array([[ 1.05030157, 0.32806136, -0.26212573, ..., -0.27747282,
0.35391138, 0.01280102],
[ 1.13209969, 0.36087074, -0.26075422, ..., -0.38430763,
0.31305797, -0.04196272],
[ 1.3725952 , 0.42210871, -0.23694544, ..., -0.32019865,
0.36630812, -0.15413755],
...,
[-0.49005491, -0.1667606 , -0.07166429, ..., 0.36594767,
-0.11585224, 0.0633935 ],
[-0.51580557, -0.15763107, -0.09296948, ..., 0.32501016,
-0.08830259, 0.05705652],
[-0.52409014, -0.14791434, -0.10220481, ..., 0.25663525,
-0.09838061, 0.05373432]])
In [34]:
def fit_svd(X, M=1):
U, s, V = np.linalg.svd(X, full_matrices=False)
return U[:, :M] @ (np.diag(s[:M]) @ V[:M, :])
In [35]:
df_imputed = df_scaled.copy()
In [36]:
row_index = np.random.choice(len(df_imputed), size=20, replace=False)
column_index = np.random.choice(df_imputed.shape[1], size=20)
In [37]:
Xhat = df_imputed.copy()
xbar = np.nanmean(df_imputed, axis=0)
Xhat[row_index, column_index] = xbar[column_index]
In [38]:
thresh = 1e-7
rel_err = 1
iter_ = 0
ismiss = np.isnan(df_imputed)
Xscaled = (df_imputed - xbar) / np.sqrt(np.sum(~ismiss, axis=0))
Xscaled_nomiss = Xscaled[~ismiss]
mssold = np.mean(np.square(Xscaled_nomiss))
mss0 = np.mean(np.square(df_imputed[~ismiss]))
In [39]:
while rel_err > thresh:
iter_ += 1
Xapp = fit_svd(Xhat, M=1)
Xhat[ismiss] = Xapp[ismiss]
mss = np.mean(np.square(df_imputed[~ismiss] - Xapp[~ismiss]))
rel_err = (mssold - mss) / mss0
mssold = mss
print(f"Iter: {iter_}, MSS: {mss}, Rel. Err: {rel_err}")
Iter: 1, MSS: 0.3201319244821912, Rel. Err: -0.31996852578938073
In [40]:
U, s, V = np.linalg.svd(df_scaled, full_matrices=False)
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(s) + 1), s ** 2, marker='o', linestyle='-')
plt.title('Scree Plot')
plt.xlabel('Component Index')
plt.ylabel('Eigenvalue')
plt.grid(True)
plt.show()
cumulative_variance_explained = np.cumsum(s ** 2) / np.sum(s ** 2)
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(s) + 1), cumulative_variance_explained, marker='o', linestyle='-')
plt.title('Cumulative Variance Explained Plot')
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Proportion of Variance Explained')
plt.grid(True)
plt.show()
Clustering¶
K Means¶
In [41]:
inertia_values = []
k_values = range(2, 30)
for k in k_values:
model = KMeans(n_clusters=k, random_state=42)
model.fit(df_scaled)
inertia_values.append(model.inertia_)
plt.figure(figsize=(10, 6))
plt.plot(k_values, inertia_values, marker='o')
plt.title('Elbow Method for Optimal k')
plt.xlabel('Number of Clusters (k)')
plt.ylabel('Inertia')
plt.xticks(k_values)
plt.grid(True)
plt.show()
In [42]:
kmeans_2 = KMeans(n_clusters=2, random_state=42)
kmeans_2.fit(df_scaled)
labels_2 = kmeans_2.labels_
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(df_scaled[:, 0], df_scaled[:, 1], c=labels_2)
ax.set_title("K-Means Clustering Results with K=2")
plt.show()
In [43]:
kmeans_17 = KMeans(n_clusters=17, random_state=42)
kmeans_17.fit(df_scaled)
labels_17 = kmeans_17.labels_
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(df_scaled[:, 0], df_scaled[:, 1], c=labels_17)
ax.set_title("K-Means Clustering Results with K=17")
plt.show()
In [44]:
kmeans_29 = KMeans(n_clusters=29, random_state=42)
kmeans_29.fit(df_scaled)
labels_29 = kmeans_29.labels_
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(df_scaled[:, 0], df_scaled[:, 1], c=labels_29)
ax.set_title("K-Means Clustering Results with K=29")
plt.show()
In [45]:
kmeans_2_1 = KMeans(n_clusters=2, random_state=3, n_init=1)
kmeans_2_1.fit(df_scaled)
inertia_2_1 = kmeans_2_1.inertia_
kmeans_2_20 = KMeans(n_clusters=2, random_state=3, n_init=20)
kmeans_2_20.fit(df_scaled)
inertia_2_20 = kmeans_2_20.inertia_
print("Inertia for KMeans with 2 clusters and n_init=1:", inertia_2_1)
print("Inertia for KMeans with 2 clusters and n_init=20:", inertia_2_20)
kmeans_17_1 = KMeans(n_clusters=17, random_state=3, n_init=1)
kmeans_17_1.fit(df_scaled)
inertia_17_1 = kmeans_17_1.inertia_
kmeans_17_20 = KMeans(n_clusters=17, random_state=3, n_init=20)
kmeans_17_20.fit(df_scaled)
inertia_17_20 = kmeans_17_20.inertia_
print("Inertia for KMeans with 17 clusters and n_init=1:", inertia_17_1)
print("Inertia for KMeans with 17 clusters and n_init=20:", inertia_17_20)
kmeans_29_1 = KMeans(n_clusters=29, random_state=3, n_init=1)
kmeans_29_1.fit(df_scaled)
inertia_29_1 = kmeans_29_1.inertia_
kmeans_29_20 = KMeans(n_clusters=29, random_state=3, n_init=20)
kmeans_29_20.fit(df_scaled)
inertia_29_20 = kmeans_29_20.inertia_
print("Inertia for KMeans with 29 clusters and n_init=1:", inertia_29_1)
print("Inertia for KMeans with 29 clusters and n_init=20:", inertia_29_20)
Inertia for KMeans with 2 clusters and n_init=1: 92144.82758027197 Inertia for KMeans with 2 clusters and n_init=20: 92144.82758027197 Inertia for KMeans with 17 clusters and n_init=1: 15774.632372538665 Inertia for KMeans with 17 clusters and n_init=20: 14187.884503011679 Inertia for KMeans with 29 clusters and n_init=1: 7970.138918026145 Inertia for KMeans with 29 clusters and n_init=20: 7837.581214293692
Hierarchical Clustering¶
In [46]:
numeric_df = df.drop(columns=['Entity', 'Code', 'Year'])
In [47]:
linkage_methods = ['single', 'complete', 'average']
def plot_dendrograms_sklearn(data, axes, title):
for method, ax in zip(linkage_methods, axes.flatten()[:-1]):
hc = AgglomerativeClustering(distance_threshold=0, n_clusters=None, linkage=method)
hc.fit(data)
Z = linkage(data, method=method)
dendrogram(Z, ax=ax, truncate_mode='level', p=5, color_threshold=5, above_threshold_color='black')
ax.set_title(f'Linkage Method: {method.capitalize()}')
plt.suptitle(title, fontsize=20)
plt.tight_layout(rect=[0, 0, 1, 0.96])
def plot_dendrogram_centroid(data, ax):
Z = linkage(data, method='centroid')
dendrogram(Z, ax=ax, truncate_mode='level', p=5, color_threshold=5, above_threshold_color='black')
ax.set_title('Linkage Method: Centroid')
fig, axes = plt.subplots(2, 2, figsize=(20, 15))
plot_dendrograms_sklearn(numeric_df, axes, "Hierarchical Clustering with Original Data")
plot_dendrogram_centroid(numeric_df, axes.flatten()[-1])
plt.show()
fig, axes = plt.subplots(2, 2, figsize=(20, 15))
plot_dendrograms_sklearn(df_scaled, axes, "Hierarchical Clustering with Scaled Data")
plot_dendrogram_centroid(df_scaled, axes.flatten()[-1])
plt.show()
In [48]:
def cut_and_print_clusters(data, Z, n_clusters, entity_column='Entity'):
clusters = cut_tree(Z, n_clusters=n_clusters).flatten()
data['Cluster'] = clusters
for cluster in range(n_clusters):
print(f"\nCluster {cluster + 1}:")
print(data[data['Cluster'] == cluster][['Cluster', entity_column]])
print("Original Data Clusters:")
Z_complete_original = linkage(numeric_df, method='complete')
cut_and_print_clusters(df.copy(), Z_complete_original, n_clusters=5
)
print("\nScaled Data Clusters:")
Z_complete_scaled = linkage(df_scaled, method='complete')
scaled_df_with_clusters = pd.DataFrame(df_scaled, columns=numeric_df.columns)
scaled_df_with_clusters['Entity'] = df['Entity'].values
cut_and_print_clusters(scaled_df_with_clusters, Z_complete_scaled, n_clusters=5, entity_column='Entity')
Original Data Clusters:
Cluster 1:
Cluster Entity
0 0 Afghanistan
1 0 Afghanistan
2 0 Afghanistan
3 0 Afghanistan
4 0 Afghanistan
... ... ...
6835 0 Zimbabwe
6836 0 Zimbabwe
6837 0 Zimbabwe
6838 0 Zimbabwe
6839 0 Zimbabwe
[6050 rows x 2 columns]
Cluster 2:
Cluster Entity
1140 1 China
1141 1 China
1142 1 China
1143 1 China
1144 1 China
1145 1 China
1146 1 China
1147 1 China
1148 1 China
1149 1 China
2692 1 India
2693 1 India
2694 1 India
2695 1 India
2696 1 India
2697 1 India
2698 1 India
2699 1 India
Cluster 3:
Cluster Entity
2670 2 India
2671 2 India
2672 2 India
2673 2 India
2674 2 India
2675 2 India
2676 2 India
2677 2 India
2678 2 India
2679 2 India
2680 2 India
2681 2 India
2682 2 India
Cluster 4:
Cluster Entity
2683 3 India
2684 3 India
2685 3 India
2686 3 India
2687 3 India
2688 3 India
2689 3 India
2690 3 India
2691 3 India
Cluster 5:
Cluster Entity
4170 4 Nigeria
4171 4 Nigeria
4172 4 Nigeria
4173 4 Nigeria
4174 4 Nigeria
4175 4 Nigeria
4176 4 Nigeria
4177 4 Nigeria
4178 4 Nigeria
4179 4 Nigeria
4180 4 Nigeria
4181 4 Nigeria
4182 4 Nigeria
4183 4 Nigeria
4184 4 Nigeria
4185 4 Nigeria
4186 4 Nigeria
4187 4 Nigeria
4188 4 Nigeria
4189 4 Nigeria
4190 4 Nigeria
4191 4 Nigeria
4192 4 Nigeria
4193 4 Nigeria
4194 4 Nigeria
4195 4 Nigeria
4196 4 Nigeria
4197 4 Nigeria
4198 4 Nigeria
4199 4 Nigeria
Scaled Data Clusters:
Cluster 1:
Cluster Entity
0 0 Afghanistan
1 0 Afghanistan
2 0 Afghanistan
3 0 Afghanistan
4 0 Afghanistan
... ... ...
6115 0 Zimbabwe
6116 0 Zimbabwe
6117 0 Zimbabwe
6118 0 Zimbabwe
6119 0 Zimbabwe
[6082 rows x 2 columns]
Cluster 2:
Cluster Entity
421 1 Bangladesh
2360 1 Haiti
2504 1 Indonesia
2520 1 Iran
3678 1 Myanmar
4125 1 Pakistan
Cluster 3:
Cluster Entity
1110 2 China
1111 2 China
1112 2 China
1113 2 China
1114 2 China
1115 2 China
1116 2 China
1117 2 China
1118 2 China
1119 2 China
1120 2 China
Cluster 4:
Cluster Entity
2460 3 India
2461 3 India
2462 3 India
2463 3 India
2464 3 India
2465 3 India
2466 3 India
2467 3 India
2468 3 India
2469 3 India
Cluster 5:
Cluster Entity
2470 4 India
2471 4 India
2472 4 India
2473 4 India
2474 4 India
2475 4 India
2476 4 India
2477 4 India
2478 4 India
2479 4 India
2480 4 India
In [ ]: